{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "dce05fb3",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "d584cb969c624473bc182df104d043cc",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading (…)lve/main/config.json:   0%|          | 0.00/136k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "40989b29e06243f992bf67aba604d3f8",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading pytorch_model.bin:   0%|          | 0.00/470M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "e0755cbbb73a4d3fa06963aaa77d6e56",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading (…)okenizer_config.json:   0%|          | 0.00/320 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "3b25660e88c94980b9c513f8f4bf3924",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "4fe6f5057548441ea98f1876672ab51c",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "291e9ecbbbad46a1bb0dc702e9f54b35",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "937d3dc7996e41439664088d484542dd",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading (…)rocessor_config.json:   0%|          | 0.00/251 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.\n"
     ]
    }
   ],
   "source": [
    "# Use a pipeline as a high-level helper\n",
    "from transformers import pipeline\n",
    "\n",
    "pipe = pipeline(\"visual-question-answering\", model=\"dandelin/vilt-b32-finetuned-vqa\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "26bb5a1e",
   "metadata": {},
   "outputs": [
    {
     "ename": "KeyError",
     "evalue": "'question'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[7], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m pipe({\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mimage\u001b[39m\u001b[38;5;124m\"\u001b[39m:\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhttps://free-images.com/md/7687/blue_jay_bird_nature.jpg\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m      2\u001b[0m       \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mquestioin\u001b[39m\u001b[38;5;124m\"\u001b[39m:\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mWhat\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124ms the animal in the image?\u001b[39m\u001b[38;5;124m\"\u001b[39m})\n",
      "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/transformers/pipelines/visual_question_answering.py:109\u001b[0m, in \u001b[0;36mVisualQuestionAnsweringPipeline.__call__\u001b[0;34m(self, image, question, **kwargs)\u001b[0m\n\u001b[1;32m    102\u001b[0m \u001b[38;5;250m    \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m    103\u001b[0m \u001b[38;5;124;03m    Supports the following format\u001b[39;00m\n\u001b[1;32m    104\u001b[0m \u001b[38;5;124;03m    - {\"image\": image, \"question\": question}\u001b[39;00m\n\u001b[1;32m    105\u001b[0m \u001b[38;5;124;03m    - [{\"image\": image, \"question\": question}]\u001b[39;00m\n\u001b[1;32m    106\u001b[0m \u001b[38;5;124;03m    - Generator and datasets\u001b[39;00m\n\u001b[1;32m    107\u001b[0m \u001b[38;5;124;03m    \"\"\"\u001b[39;00m\n\u001b[1;32m    108\u001b[0m     inputs \u001b[38;5;241m=\u001b[39m image\n\u001b[0;32m--> 109\u001b[0m results \u001b[38;5;241m=\u001b[39m \u001b[38;5;28msuper\u001b[39m()\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__call__\u001b[39m(inputs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m    110\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m results\n",
      "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/transformers/pipelines/base.py:1119\u001b[0m, in \u001b[0;36mPipeline.__call__\u001b[0;34m(self, inputs, num_workers, batch_size, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1111\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mnext\u001b[39m(\n\u001b[1;32m   1112\u001b[0m         \u001b[38;5;28miter\u001b[39m(\n\u001b[1;32m   1113\u001b[0m             \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_iterator(\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m   1116\u001b[0m         )\n\u001b[1;32m   1117\u001b[0m     )\n\u001b[1;32m   1118\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1119\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrun_single(inputs, preprocess_params, forward_params, postprocess_params)\n",
      "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/transformers/pipelines/base.py:1125\u001b[0m, in \u001b[0;36mPipeline.run_single\u001b[0;34m(self, inputs, preprocess_params, forward_params, postprocess_params)\u001b[0m\n\u001b[1;32m   1124\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mrun_single\u001b[39m(\u001b[38;5;28mself\u001b[39m, inputs, preprocess_params, forward_params, postprocess_params):\n\u001b[0;32m-> 1125\u001b[0m     model_inputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpreprocess(inputs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mpreprocess_params)\n\u001b[1;32m   1126\u001b[0m     model_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mforward(model_inputs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mforward_params)\n\u001b[1;32m   1127\u001b[0m     outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpostprocess(model_outputs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mpostprocess_params)\n",
      "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/transformers/pipelines/visual_question_answering.py:115\u001b[0m, in \u001b[0;36mVisualQuestionAnsweringPipeline.preprocess\u001b[0;34m(self, inputs, padding, truncation)\u001b[0m\n\u001b[1;32m    112\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mpreprocess\u001b[39m(\u001b[38;5;28mself\u001b[39m, inputs, padding\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m, truncation\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m):\n\u001b[1;32m    113\u001b[0m     image \u001b[38;5;241m=\u001b[39m load_image(inputs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mimage\u001b[39m\u001b[38;5;124m\"\u001b[39m])\n\u001b[1;32m    114\u001b[0m     model_inputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtokenizer(\n\u001b[0;32m--> 115\u001b[0m         inputs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mquestion\u001b[39m\u001b[38;5;124m\"\u001b[39m], return_tensors\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mframework, padding\u001b[38;5;241m=\u001b[39mpadding, truncation\u001b[38;5;241m=\u001b[39mtruncation\n\u001b[1;32m    116\u001b[0m     )\n\u001b[1;32m    117\u001b[0m     image_features \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mimage_processor(images\u001b[38;5;241m=\u001b[39mimage, return_tensors\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mframework)\n\u001b[1;32m    118\u001b[0m     model_inputs\u001b[38;5;241m.\u001b[39mupdate(image_features)\n",
      "\u001b[0;31mKeyError\u001b[0m: 'question'"
     ]
    }
   ],
   "source": [
    "pipe({\"image\":\"https://free-images.com/md/7687/blue_jay_bird_nature.jpg\",\n",
    "      \"question\":\"What's the animal in the image?\"})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "db5d4b9f",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
